import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.pyplot import figure
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
import xgboost as xgb
pd.set_option('display.max_columns', 50)
RFC_METRIC = 'gini' #split-quality criterion used for RandomForestClassifier
NUM_ESTIMATORS = 100 #number of estimators used for RandomForestClassifier
NO_JOBS = 4 #number of parallel jobs used for RandomForestClassifier
#VALIDATION
VALID_SIZE = 0.20 # simple validation using train_test_split
#CROSS-VALIDATION
NUMBER_KFOLDS = 5 #number of KFolds for cross-validation
RANDOM_STATE = 2021
MAX_ROUNDS = 1000 #lgb iterations
EARLY_STOP = 50 #lgb early stop
OPT_ROUNDS = 1000 #To be adjusted based on best validation rounds
VERBOSE_EVAL = 50 #Print out metric result
IS_LOCAL = False
# Load the training data (expects train.csv in the working directory).
data_df = pd.read_csv('train.csv')
Look at the first 10 rows of the data
# Preview the first 10 rows.
data_df.head(10)
| state | account_length | area_code | international_plan | voice_mail_plan | number_vmail_messages | total_day_minutes | total_day_calls | total_day_charge | total_eve_minutes | total_eve_calls | total_eve_charge | total_night_minutes | total_night_calls | total_night_charge | total_intl_minutes | total_intl_calls | total_intl_charge | number_customer_service_calls | payment_delay | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | HI | 33 | area_code_415 | no | no | 0 | 200.5 | 117 | 34.09 | 159.9 | 111 | 13.59 | 196.2 | 84 | 8.83 | 16.3 | 6 | 4.40 | 3 | no |
| 1 | TN | 80 | area_code_415 | yes | no | 0 | 276.5 | 122 | 47.01 | 195.6 | 79 | 16.63 | 210.3 | 78 | 9.46 | 7.2 | 3 | 1.94 | 1 | yes |
| 2 | CT | 37 | area_code_408 | no | no | 0 | 134.9 | 98 | 22.93 | 248.4 | 130 | 21.11 | 236.2 | 113 | 10.63 | 14.7 | 2 | 3.97 | 3 | no |
| 3 | TN | 106 | area_code_415 | no | no | 0 | 119.2 | 142 | 20.26 | 228.4 | 139 | 19.41 | 197.9 | 61 | 8.91 | 8.4 | 9 | 2.27 | 2 | no |
| 4 | TX | 123 | area_code_408 | no | no | 0 | 260.9 | 85 | 44.35 | 168.5 | 103 | 14.32 | 178.3 | 91 | 8.02 | 13.3 | 5 | 3.59 | 3 | no |
| 5 | CT | 152 | area_code_408 | no | yes | 20 | 239.1 | 105 | 40.65 | 209.1 | 111 | 17.77 | 268.2 | 130 | 12.07 | 13.3 | 3 | 3.59 | 5 | no |
| 6 | NY | 87 | area_code_415 | no | no | 0 | 204.8 | 101 | 34.82 | 161.0 | 80 | 13.69 | 285.7 | 89 | 12.86 | 9.5 | 3 | 2.57 | 0 | no |
| 7 | UT | 110 | area_code_415 | no | no | 0 | 271.1 | 108 | 46.09 | 237.0 | 122 | 20.15 | 239.9 | 122 | 10.80 | 9.8 | 5 | 2.65 | 2 | yes |
| 8 | IL | 66 | area_code_415 | no | yes | 21 | 134.4 | 110 | 22.85 | 136.2 | 104 | 11.58 | 215.6 | 105 | 9.70 | 9.7 | 4 | 2.62 | 3 | no |
| 9 | MI | 119 | area_code_510 | yes | yes | 22 | 172.1 | 119 | 29.26 | 223.6 | 133 | 19.01 | 150.0 | 94 | 6.75 | 13.9 | 20 | 3.75 | 1 | yes |
Convert "yes" and "no" to integers 1 and 0
def yes2int(string):
    """Translate the literal strings "yes"/"no" to 1/0; any other value passes through unchanged."""
    if not isinstance(string, str):
        return string
    return {"yes": 1, "no": 0}.get(string, string)
# Apply the yes/no -> 1/0 conversion element-wise across the whole frame.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 (renamed to
# DataFrame.map) — verify the pandas version before upgrading.
data_df = data_df.applymap(yes2int)
data_df.head(3)
| state | account_length | area_code | international_plan | voice_mail_plan | number_vmail_messages | total_day_minutes | total_day_calls | total_day_charge | total_eve_minutes | total_eve_calls | total_eve_charge | total_night_minutes | total_night_calls | total_night_charge | total_intl_minutes | total_intl_calls | total_intl_charge | number_customer_service_calls | payment_delay | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | HI | 33 | area_code_415 | 0 | 0 | 0 | 200.5 | 117 | 34.09 | 159.9 | 111 | 13.59 | 196.2 | 84 | 8.83 | 16.3 | 6 | 4.40 | 3 | 0 |
| 1 | TN | 80 | area_code_415 | 1 | 0 | 0 | 276.5 | 122 | 47.01 | 195.6 | 79 | 16.63 | 210.3 | 78 | 9.46 | 7.2 | 3 | 1.94 | 1 | 1 |
| 2 | CT | 37 | area_code_408 | 0 | 0 | 0 | 134.9 | 98 | 22.93 | 248.4 | 130 | 21.11 | 236.2 | 113 | 10.63 | 14.7 | 2 | 3.97 | 3 | 0 |
Let's have a more detailed look
# Summary statistics for the numeric columns.
data_df.describe()
| account_length | international_plan | voice_mail_plan | number_vmail_messages | total_day_minutes | total_day_calls | total_day_charge | total_eve_minutes | total_eve_calls | total_eve_charge | total_night_minutes | total_night_calls | total_night_charge | total_intl_minutes | total_intl_calls | total_intl_charge | number_customer_service_calls | payment_delay | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 | 3000.000000 |
| mean | 100.510000 | 0.093667 | 0.270333 | 7.977333 | 180.456533 | 100.228667 | 30.678167 | 200.034000 | 100.247333 | 17.003100 | 201.099633 | 99.939667 | 9.049577 | 10.206500 | 4.495333 | 2.756263 | 1.591667 | 0.137667 |
| std | 39.485782 | 0.291413 | 0.444206 | 13.730125 | 54.098371 | 19.763769 | 9.196706 | 50.127406 | 19.761638 | 4.260769 | 50.863770 | 19.669680 | 2.288908 | 2.749696 | 2.489051 | 0.742311 | 1.314346 | 0.344607 |
| min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 22.300000 | 12.000000 | 1.900000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 73.000000 | 0.000000 | 0.000000 | 0.000000 | 143.700000 | 87.000000 | 24.430000 | 165.800000 | 87.000000 | 14.090000 | 166.975000 | 87.000000 | 7.517500 | 8.500000 | 3.000000 | 2.300000 | 1.000000 | 0.000000 |
| 50% | 100.000000 | 0.000000 | 0.000000 | 0.000000 | 179.400000 | 101.000000 | 30.500000 | 199.900000 | 101.000000 | 16.990000 | 201.600000 | 100.000000 | 9.070000 | 10.300000 | 4.000000 | 2.780000 | 1.000000 | 0.000000 |
| 75% | 127.000000 | 0.000000 | 1.000000 | 18.000000 | 216.800000 | 113.000000 | 36.860000 | 233.700000 | 113.000000 | 19.860000 | 235.225000 | 113.000000 | 10.582500 | 12.000000 | 6.000000 | 3.240000 | 2.000000 | 0.000000 |
| max | 233.000000 | 1.000000 | 1.000000 | 52.000000 | 346.800000 | 165.000000 | 58.960000 | 363.700000 | 169.000000 | 30.910000 | 395.000000 | 175.000000 | 17.770000 | 20.000000 | 20.000000 | 5.400000 | 9.000000 | 1.000000 |
# Visualise how unbalanced the target column is.
counts = data_df["payment_delay"].value_counts()
df = pd.DataFrame({'payment_delay': counts.index, 'values': counts.values})
plt.figure(figsize=(6, 6))
plt.title('Payment Delays Clients - target value - data unbalance\n (No delay = 0, Delay = 1)')
sns.set_color_codes("pastel")
sns.barplot(x='payment_delay', y="values", data=df)
locs, labels = plt.xticks()
plt.show()
We have 3000 distinct clients.
13.7% of the dataset entries are delayed payments.
A min value of 1 for account length is unusual.
# Count and percentage of missing values per column, transposed for readability.
null_counts = data_df.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / len(data_df) * 100).sort_values(ascending=False)
pd.concat([total, percent], axis=1, keys=['Total', 'Percent']).transpose()
| payment_delay | number_customer_service_calls | account_length | area_code | international_plan | voice_mail_plan | number_vmail_messages | total_day_minutes | total_day_calls | total_day_charge | total_eve_minutes | total_eve_calls | total_eve_charge | total_night_minutes | total_night_calls | total_night_charge | total_intl_minutes | total_intl_calls | total_intl_charge | state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Total | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| Percent | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
There is no missing data in the entire dataset.
Check for data correlation
# Pairwise scatter plots of the candidate features (identifier-like columns dropped).
features = data_df.drop(columns=["state", "account_length", "area_code"])
sns.pairplot(features)
plt.show()
We notice that each total_*_minutes column is perfectly correlated with its corresponding total_*_charge column.
As a result we will not use total_day_minutes, total_eve_minutes, total_night_minutes and total_intl_minutes in order to reduce the model dimensionality.
Our data set now looks like this:
# Preview the first 5 rows after the yes/no conversion.
data_df.head(5)
| state | account_length | area_code | international_plan | voice_mail_plan | number_vmail_messages | total_day_minutes | total_day_calls | total_day_charge | total_eve_minutes | total_eve_calls | total_eve_charge | total_night_minutes | total_night_calls | total_night_charge | total_intl_minutes | total_intl_calls | total_intl_charge | number_customer_service_calls | payment_delay | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | HI | 33 | area_code_415 | 0 | 0 | 0 | 200.5 | 117 | 34.09 | 159.9 | 111 | 13.59 | 196.2 | 84 | 8.83 | 16.3 | 6 | 4.40 | 3 | 0 |
| 1 | TN | 80 | area_code_415 | 1 | 0 | 0 | 276.5 | 122 | 47.01 | 195.6 | 79 | 16.63 | 210.3 | 78 | 9.46 | 7.2 | 3 | 1.94 | 1 | 1 |
| 2 | CT | 37 | area_code_408 | 0 | 0 | 0 | 134.9 | 98 | 22.93 | 248.4 | 130 | 21.11 | 236.2 | 113 | 10.63 | 14.7 | 2 | 3.97 | 3 | 0 |
| 3 | TN | 106 | area_code_415 | 0 | 0 | 0 | 119.2 | 142 | 20.26 | 228.4 | 139 | 19.41 | 197.9 | 61 | 8.91 | 8.4 | 9 | 2.27 | 2 | 0 |
| 4 | TX | 123 | area_code_408 | 0 | 0 | 0 | 260.9 | 85 | 44.35 | 168.5 | 103 | 14.32 | 178.3 | 91 | 8.02 | 13.3 | 5 | 3.59 | 3 | 0 |
I have some suspicions regarding the account_length but plotting a histogram on the length of it does show that is fairly well distributed across the data set, so I will not remove any entries on this criteria.
# Plot the distribution of account_length.
# BUG FIX: the original computed `hist, _ = np.histogram(...)` and then called
# plt.hist(hist, ...), i.e. it plotted a histogram OF THE BIN COUNTS rather
# than of the column itself. Plot the raw column directly.
plt.hist(data_df['account_length'], bins='auto')
plt.show()
Also the account length does not seem to influence the outcome in any way, nor is it correlated with the other features.
We will assume that payment delay is not influenced by state, area code or account length.
# Joint distribution of international_plan vs payment_delay, as fractions of the dataset.
n_rows = len(data_df)
true_ip_true_pd = len(data_df[(data_df['international_plan'] == 1) & (data_df['payment_delay'] == 1)]) / n_rows
true_ip_false_pd = len(data_df[(data_df['international_plan'] == 1) & (data_df['payment_delay'] == 0)]) / n_rows
false_ip_true_pd = len(data_df[(data_df['international_plan'] == 0) & (data_df['payment_delay'] == 1)]) / n_rows
false_ip_false_pd = len(data_df[(data_df['international_plan'] == 0) & (data_df['payment_delay'] == 0)]) / n_rows
df = pd.DataFrame({"ratio": [true_ip_false_pd, true_ip_true_pd, false_ip_false_pd, false_ip_true_pd]},
                  index=["ip - no pd", "ip - pd", "no ip - no pd", "no ip - pd"])
plot = df.plot.pie(y='ratio', figsize=(15, 15), startangle=90, autopct='%1.1f%%')
# FIX: the printed message said "change" where "chance" was meant.
print("Payment delays in case of international plan clients has a chance of {}%".format(round(true_ip_true_pd/(true_ip_true_pd+true_ip_false_pd), 4)*100))
print("Payment delays in case of no international plan clients has a chance of {}%".format(round(false_ip_true_pd/(false_ip_true_pd+false_ip_false_pd), 4)*100))
Payment delays in case of international plan clients have a chance of 41.28%. Payment delays in case of no international plan clients have a chance of 10.92%.
We can see that payment delays is more frequent if the person also has an International plan.
# Joint distribution of voice_mail_plan vs payment_delay, as fractions of the dataset.
n_rows = len(data_df)
true_vp_true_pd = len(data_df[(data_df['voice_mail_plan'] == 1) & (data_df['payment_delay'] == 1)]) / n_rows
true_vp_false_pd = len(data_df[(data_df['voice_mail_plan'] == 1) & (data_df['payment_delay'] == 0)]) / n_rows
false_vp_true_pd = len(data_df[(data_df['voice_mail_plan'] == 0) & (data_df['payment_delay'] == 1)]) / n_rows
false_vp_false_pd = len(data_df[(data_df['voice_mail_plan'] == 0) & (data_df['payment_delay'] == 0)]) / n_rows
df = pd.DataFrame({"ratio": [true_vp_false_pd, true_vp_true_pd, false_vp_false_pd, false_vp_true_pd]},
                  index=["vp - no pd", "vp - pd", "no vp - no pd", "no vp - pd"])
plot = df.plot.pie(y='ratio', figsize=(15, 15), startangle=90, autopct='%1.1f%%')
# FIX: the printed message said "change" where "chance" was meant.
print("Payment delays in case of voice mail plan clients has a chance of {}%".format(round(true_vp_true_pd/(true_vp_true_pd+true_vp_false_pd), 4)*100))
print("Payment delays in case of no voice mail plan clients has a chance of {}%".format(round(false_vp_true_pd/(false_vp_true_pd+false_vp_false_pd), 4)*100))
Payment delays in case of voice mail plan clients have a chance of 7.64%. Payment delays in case of no voice mail plan clients have a chance of 16.03%.
We can see that payment delays is more frequent if the person does not have a voice mail plan.
# Box plots of each charge component, split by the target value.
fig, axes = plt.subplots(ncols=4, figsize=(48, 12))
charge_columns = ["total_day_charge", "total_eve_charge",
                  "total_night_charge", "total_intl_charge"]
for axis, column in zip(axes, charge_columns):
    sns.boxplot(ax=axis, x="payment_delay", y=column, hue="payment_delay",
                data=data_df, palette="PRGn", showfliers=False)
plt.show();
It looks like payment delays are more frequent for users who use the voice plan more, regardless of the time of the day.
This difference is more noticeable in the case of day_charge.
# Target column and the feature set used by every model below.
# The *_minutes columns are excluded (perfectly correlated with *_charge),
# as are the identifier-like state / area_code / account_length columns.
target = 'payment_delay'
predictors = ['international_plan','voice_mail_plan','number_vmail_messages','total_day_calls','total_day_charge',
              'total_eve_calls', 'total_eve_charge', 'total_night_calls', 'total_night_charge', 'total_intl_calls',
              'total_intl_charge', 'number_customer_service_calls']
data_df[predictors]
| international_plan | voice_mail_plan | number_vmail_messages | total_day_calls | total_day_charge | total_eve_calls | total_eve_charge | total_night_calls | total_night_charge | total_intl_calls | total_intl_charge | number_customer_service_calls | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 117 | 34.09 | 111 | 13.59 | 84 | 8.83 | 6 | 4.40 | 3 |
| 1 | 1 | 0 | 0 | 122 | 47.01 | 79 | 16.63 | 78 | 9.46 | 3 | 1.94 | 1 |
| 2 | 0 | 0 | 0 | 98 | 22.93 | 130 | 21.11 | 113 | 10.63 | 2 | 3.97 | 3 |
| 3 | 0 | 0 | 0 | 142 | 20.26 | 139 | 19.41 | 61 | 8.91 | 9 | 2.27 | 2 |
| 4 | 0 | 0 | 0 | 85 | 44.35 | 103 | 14.32 | 91 | 8.02 | 5 | 3.59 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2995 | 0 | 0 | 0 | 103 | 31.11 | 77 | 18.26 | 73 | 9.29 | 6 | 2.35 | 2 |
| 2996 | 0 | 0 | 0 | 66 | 44.85 | 116 | 21.36 | 112 | 9.00 | 2 | 2.27 | 5 |
| 2997 | 0 | 0 | 0 | 83 | 25.21 | 79 | 15.44 | 104 | 7.00 | 6 | 2.24 | 3 |
| 2998 | 0 | 0 | 0 | 110 | 31.82 | 116 | 9.75 | 83 | 4.71 | 5 | 3.56 | 1 |
| 2999 | 1 | 0 | 0 | 99 | 42.08 | 118 | 9.22 | 72 | 10.44 | 3 | 2.86 | 2 |
3000 rows × 12 columns
# 80/20 train/validation split, seeded for reproducibility.
train_df, val_df = train_test_split(data_df, test_size=VALID_SIZE, random_state=RANDOM_STATE, shuffle=True )
We also need to make copies of those splits for later usages
# Pristine copies of the splits, since train_df is modified (rebalanced) later.
train_df_bkp = train_df.copy()
val_df_bkp = val_df.copy()
# Random forest baseline, trained on the raw (unbalanced) training split.
rfc_clf = RandomForestClassifier(n_jobs=NO_JOBS,
                                 random_state=RANDOM_STATE,
                                 criterion=RFC_METRIC,
                                 n_estimators=NUM_ESTIMATORS,
                                 verbose=False)
rfc_clf.fit(train_df[predictors], train_df[target].values)
RandomForestClassifier(n_jobs=4, random_state=2021, verbose=False)
preds = rfc_clf.predict(val_df[predictors])
# Rank the features by the forest's impurity-based importances and plot them.
importance_df = (pd.DataFrame({'Feature': predictors,
                               'Feature importance': rfc_clf.feature_importances_})
                 .sort_values(by='Feature importance', ascending=False))
plt.figure(figsize=(7, 4))
plt.title('Features importance', fontsize=14)
s = sns.barplot(x='Feature', y='Feature importance', data=importance_df)
s.set_xticklabels(s.get_xticklabels(), rotation=90)
plt.show()
It looks like total_day_charge is rated as the most important feature by the Random Forest, followed by total_eve_charge and number_customer_service_calls.
# Confusion matrix of the random forest on the validation split.
cm = pd.crosstab(val_df[target].values, preds, rownames=['Actual'], colnames=['Predicted'])
fig, (ax1) = plt.subplots(ncols=1, figsize=(5, 5))
sns.heatmap(cm,
            xticklabels=['No Delay', 'Delay'],
            yticklabels=['No Delay', 'Delay'],
            annot=True, ax=ax1,
            linewidths=.2, linecolor="Darkblue", cmap="Blues")
plt.title('Confusion Matrix', fontsize=14)
plt.show()
We also calculate area under curve (receiver operator characteristic)
# ROC-AUC computed on the hard 0/1 predictions (not probabilities).
roc_auc_score(val_df[target].values, preds)
0.8351217045229665
The ROC-AUC score obtained with RandomForestClassifier is 0.835.
# AdaBoost model; SAMME.R uses class probabilities from the base estimators.
# NOTE(review): the `algorithm` parameter is deprecated in recent scikit-learn
# releases — verify against the installed version.
abc_clf = AdaBoostClassifier(random_state=RANDOM_STATE,
                             algorithm='SAMME.R',
                             learning_rate=0.8,
                             n_estimators=NUM_ESTIMATORS)
We fit the model
# Fit AdaBoost on the training split.
abc_clf.fit(train_df[predictors], train_df[target].values)
AdaBoostClassifier(learning_rate=0.8, n_estimators=100, random_state=2021)
preds = abc_clf.predict(val_df[predictors])
# Feature importances as seen by AdaBoost, sorted descending.
importances = pd.DataFrame({'Feature': predictors,
                            'Feature importance': abc_clf.feature_importances_})
importances = importances.sort_values(by='Feature importance', ascending=False)
plt.figure(figsize=(7, 4))
plt.title('Features importance', fontsize=14)
s = sns.barplot(x='Feature', y='Feature importance', data=importances)
s.set_xticklabels(s.get_xticklabels(), rotation=90)
plt.show()
# Confusion matrix and ROC-AUC for AdaBoost on the validation split.
cm = pd.crosstab(val_df[target].values, preds, rownames=['Actual'], colnames=['Predicted'])
fig, (ax1) = plt.subplots(ncols=1, figsize=(5, 5))
sns.heatmap(cm,
            xticklabels=['No Delay', 'Delay'],
            yticklabels=['No Delay', 'Delay'],
            annot=True, ax=ax1,
            linewidths=.2, linecolor="Darkblue", cmap="Blues")
plt.title('Confusion Matrix', fontsize=14)
plt.show()
roc_auc_score(val_df[target].values, preds)
0.6419006574462939
The ROC-AUC score obtained with AdaBoostClassifier is 0.641.
# Prepare the train and valid datasets
dtrain = xgb.DMatrix(train_df[predictors], train_df[target].values)
dvalid = xgb.DMatrix(val_df[predictors], val_df[target].values)
# What to monitor during training (in this case, **train** and **valid**)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
# Set xgboost parameters.
# FIX: 'silent' was removed from XGBoost — the original training log showed
# "Parameters: { silent } might not be used"; 'verbosity' is its replacement.
params = {}
params['objective'] = 'binary:logistic'  # outputs a delay probability
params['eta'] = 0.039                    # learning rate
params['verbosity'] = 0                  # suppress info/warning messages
params['max_depth'] = 2
params['subsample'] = 0.8
params['colsample_bytree'] = 0.9
params['eval_metric'] = 'auc'
params['random_state'] = RANDOM_STATE
Training phase
# Train with early stopping on validation AUC (maximize=True: higher AUC is better).
xgb_clf = xgb.train(params,
                    dtrain,
                    MAX_ROUNDS,
                    watchlist,
                    early_stopping_rounds=EARLY_STOP,
                    maximize=True,
                    verbose_eval=VERBOSE_EVAL)
[00:42:06] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:541:
Parameters: { silent } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-auc:0.70263 valid-auc:0.75218
[50] train-auc:0.88846 valid-auc:0.90875
[100] train-auc:0.90792 valid-auc:0.92357
[150] train-auc:0.92239 valid-auc:0.92847
[191] train-auc:0.92724 valid-auc:0.92927
preds = xgb_clf.predict(dvalid)
# BUG FIX: the original did `preds_confusion = preds`, which aliases the same
# numpy array — the in-place thresholding below therefore destroyed the
# probability scores held in `preds` as well. Work on an explicit copy.
preds_confusion = preds.copy()
preds_confusion[preds_confusion < 0.5] = 0
preds_confusion[preds_confusion >= 0.5] = 1
fig, (ax) = plt.subplots(ncols=1, figsize=(8, 5))
xgb.plot_importance(xgb_clf, height=0.8, title="Features importance (XGBoost)", ax=ax, color="green")
plt.show()
# Confusion matrix for XGBoost on the validation split.
# FIX: the axis labels said 'Not Default'/'Default' (copied from a credit
# notebook); this dataset's target is payment delay, so label it consistently
# with the other confusion matrices in this analysis.
cm = pd.crosstab(val_df[target].values, preds_confusion, rownames=['Actual'], colnames=['Predicted'])
fig, (ax1) = plt.subplots(ncols=1, figsize=(5, 5))
sns.heatmap(cm,
            xticklabels=['No Delay', 'Delay'],
            yticklabels=['No Delay', 'Delay'],
            annot=True, ax=ax1,
            linewidths=.2, linecolor="Darkblue", cmap="Blues")
plt.title('Confusion Matrix', fontsize=14)
plt.show()
Out of all the classifiers using decision trees it looks like the Gradient Boosting method has the best accuracy, while also giving a probability distribution at its output, which will come in handy when trying to rank all the clients that will delay their payments.
For a neural network we will need to balance both classes
# Balance the classes for the neural network by downsampling the majority class.
class_1 = train_df[train_df['payment_delay'] == 1]
# FIX: match the majority-class sample count to the actual minority-class size
# instead of the hard-coded 324, so the split stays balanced if the data changes.
train_df = train_df[train_df['payment_delay'] == 0][:len(class_1)]
# FIX: DataFrame.append is deprecated (removed in pandas 2.0); use pd.concat.
train_df = pd.concat([train_df, class_1])
training_epochs = 10
minibatch_size = 40
sc = MinMaxScaler()
# BUG FIX: the scaler was refit on the validation split (fit_transform on both),
# leaking validation statistics into preprocessing. Fit on train only and
# reuse the fitted scaler to transform the validation features.
X_train = sc.fit_transform(train_df[predictors])
X_test = sc.transform(val_df[predictors])
Y_train, Y_test = train_df[target], val_df[target]
# Wrap everything as float tensors and build shuffled mini-batch loaders.
X_train = torch.from_numpy(np.array(X_train)).float()
Y_train = torch.from_numpy(np.array(Y_train)).float()
X_test = torch.from_numpy(np.array(X_test)).float()
Y_test = torch.from_numpy(np.array(Y_test)).float()
train = data_utils.TensorDataset(X_train, Y_train)
train_loader = data_utils.DataLoader(train, batch_size=minibatch_size, shuffle=True)
test = data_utils.TensorDataset(X_test, Y_test)
test_loader = data_utils.DataLoader(test, batch_size=minibatch_size, shuffle=True)
class Model(nn.Module):
    """Small fully-connected binary classifier: 12 -> 40 -> 20 -> 10 -> 1."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(12, 40)
        self.fc2 = nn.Linear(40, 20)
        self.fc3 = nn.Linear(20, 10)
        self.fc4 = nn.Linear(10, 1)

    def forward(self, x):
        """Return one sigmoid probability in (0, 1) per input row of shape (N, 12)."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # FIX: F.sigmoid is deprecated (the run log showed the UserWarning);
        # torch.sigmoid is the supported equivalent.
        x = torch.sigmoid(self.fc4(x))
        return x
# Binary cross-entropy on the sigmoid output, optimised with Adam.
dense_clf = Model()
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(dense_clf.parameters(), lr=3e-3)
# Standard training loop over the balanced training set.
for i in range(training_epochs):
    for b, data in enumerate(train_loader, 0):
        inputs, labels = data
        y_pred = dense_clf(inputs)
        y_pred = torch.reshape(y_pred, (-1,))  # (N, 1) -> (N,) to match labels
        loss = loss_fn(y_pred, labels)
        # BUG FIX: the original condition was `if b % 100:`, which logs every
        # batch EXCEPT multiples of 100 (the run log printed every batch).
        # Log every 100th batch instead.
        if b % 100 == 0:
            print('Epochs: {}, batch: {} loss: {}'.format(i, b, loss))
        #reset gradients
        optimizer.zero_grad()
        # backward pass
        loss.backward()
        # update weights
        optimizer.step()
/Users/genitroi/opt/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/nn/functional.py:1639: UserWarning: nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.
warnings.warn("nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.")
Epochs: 0, batch: 1 loss: 0.7285537719726562 Epochs: 0, batch: 2 loss: 0.7250202894210815 Epochs: 0, batch: 3 loss: 0.6785116791725159 Epochs: 0, batch: 4 loss: 0.6827524900436401 Epochs: 0, batch: 5 loss: 0.7003644704818726 Epochs: 0, batch: 6 loss: 0.671264111995697 Epochs: 0, batch: 7 loss: 0.7042077779769897 Epochs: 0, batch: 8 loss: 0.712243914604187 Epochs: 0, batch: 9 loss: 0.6904342770576477 Epochs: 0, batch: 10 loss: 0.697802722454071 Epochs: 0, batch: 11 loss: 0.6878407001495361 Epochs: 0, batch: 12 loss: 0.6856421232223511 Epochs: 0, batch: 13 loss: 0.6941156983375549 Epochs: 0, batch: 14 loss: 0.691168487071991 Epochs: 0, batch: 15 loss: 0.6794129610061646 Epochs: 0, batch: 16 loss: 0.6811308264732361 Epochs: 1, batch: 1 loss: 0.6797451972961426 Epochs: 1, batch: 2 loss: 0.6801009178161621 Epochs: 1, batch: 3 loss: 0.6795646548271179 Epochs: 1, batch: 4 loss: 0.6853407621383667 Epochs: 1, batch: 5 loss: 0.6723469495773315 Epochs: 1, batch: 6 loss: 0.6738753914833069 Epochs: 1, batch: 7 loss: 0.6736747622489929 Epochs: 1, batch: 8 loss: 0.6738263964653015 Epochs: 1, batch: 9 loss: 0.685857355594635 Epochs: 1, batch: 10 loss: 0.6693999767303467 Epochs: 1, batch: 11 loss: 0.7022826075553894 Epochs: 1, batch: 12 loss: 0.6677065491676331 Epochs: 1, batch: 13 loss: 0.6546112298965454 Epochs: 1, batch: 14 loss: 0.6629194021224976 Epochs: 1, batch: 15 loss: 0.6545184254646301 Epochs: 1, batch: 16 loss: 0.6855237483978271 Epochs: 2, batch: 1 loss: 0.6822410821914673 Epochs: 2, batch: 2 loss: 0.6761938333511353 Epochs: 2, batch: 3 loss: 0.6755380034446716 Epochs: 2, batch: 4 loss: 0.6330102682113647 Epochs: 2, batch: 5 loss: 0.6328360438346863 Epochs: 2, batch: 6 loss: 0.6352663636207581 Epochs: 2, batch: 7 loss: 0.6068594455718994 Epochs: 2, batch: 8 loss: 0.6298763155937195 Epochs: 2, batch: 9 loss: 0.638727605342865 Epochs: 2, batch: 10 loss: 0.6576193571090698 Epochs: 2, batch: 11 loss: 0.6811193227767944 Epochs: 2, batch: 12 loss: 0.6530911922454834 Epochs: 
2, batch: 13 loss: 0.6107818484306335 Epochs: 2, batch: 14 loss: 0.6764975190162659 Epochs: 2, batch: 15 loss: 0.6018844246864319 Epochs: 2, batch: 16 loss: 0.6868311762809753 Epochs: 3, batch: 1 loss: 0.6595735549926758 Epochs: 3, batch: 2 loss: 0.6646789312362671 Epochs: 3, batch: 3 loss: 0.6274805665016174 Epochs: 3, batch: 4 loss: 0.6467918753623962 Epochs: 3, batch: 5 loss: 0.5878967642784119 Epochs: 3, batch: 6 loss: 0.5683969259262085 Epochs: 3, batch: 7 loss: 0.6535037755966187 Epochs: 3, batch: 8 loss: 0.6618342399597168 Epochs: 3, batch: 9 loss: 0.7058497071266174 Epochs: 3, batch: 10 loss: 0.5899946689605713 Epochs: 3, batch: 11 loss: 0.5591994524002075 Epochs: 3, batch: 12 loss: 0.5423356294631958 Epochs: 3, batch: 13 loss: 0.6061860918998718 Epochs: 3, batch: 14 loss: 0.6026137471199036 Epochs: 3, batch: 15 loss: 0.6177447438240051 Epochs: 3, batch: 16 loss: 0.5519778728485107 Epochs: 4, batch: 1 loss: 0.593725323677063 Epochs: 4, batch: 2 loss: 0.5510996580123901 Epochs: 4, batch: 3 loss: 0.6458462476730347 Epochs: 4, batch: 4 loss: 0.6484696269035339 Epochs: 4, batch: 5 loss: 0.5857691764831543 Epochs: 4, batch: 6 loss: 0.5654327273368835 Epochs: 4, batch: 7 loss: 0.5438541173934937 Epochs: 4, batch: 8 loss: 0.5867815017700195 Epochs: 4, batch: 9 loss: 0.6935413479804993 Epochs: 4, batch: 10 loss: 0.6155340075492859 Epochs: 4, batch: 11 loss: 0.59988933801651 Epochs: 4, batch: 12 loss: 0.5150548815727234 Epochs: 4, batch: 13 loss: 0.6461114883422852 Epochs: 4, batch: 14 loss: 0.5886737704277039 Epochs: 4, batch: 15 loss: 0.5607965588569641 Epochs: 4, batch: 16 loss: 0.5512087941169739 Epochs: 5, batch: 1 loss: 0.5822216272354126 Epochs: 5, batch: 2 loss: 0.539408266544342 Epochs: 5, batch: 3 loss: 0.5590993762016296 Epochs: 5, batch: 4 loss: 0.5856240391731262 Epochs: 5, batch: 5 loss: 0.55791175365448 Epochs: 5, batch: 6 loss: 0.5639685392379761 Epochs: 5, batch: 7 loss: 0.5957358479499817 Epochs: 5, batch: 8 loss: 0.5768750905990601 Epochs: 5, 
batch: 9 loss: 0.5004059076309204 Epochs: 5, batch: 10 loss: 0.5269104242324829 Epochs: 5, batch: 11 loss: 0.5006337761878967 Epochs: 5, batch: 12 loss: 0.5552749633789062 Epochs: 5, batch: 13 loss: 0.6398079991340637 Epochs: 5, batch: 14 loss: 0.5458813309669495 Epochs: 5, batch: 15 loss: 0.6544061899185181 Epochs: 5, batch: 16 loss: 0.507091760635376 Epochs: 6, batch: 1 loss: 0.5570782423019409 Epochs: 6, batch: 2 loss: 0.5664411783218384 Epochs: 6, batch: 3 loss: 0.5891796946525574 Epochs: 6, batch: 4 loss: 0.5243016481399536 Epochs: 6, batch: 5 loss: 0.5338126420974731 Epochs: 6, batch: 6 loss: 0.5574967265129089 Epochs: 6, batch: 7 loss: 0.5628961324691772 Epochs: 6, batch: 8 loss: 0.5476564168930054 Epochs: 6, batch: 9 loss: 0.509372353553772 Epochs: 6, batch: 10 loss: 0.5149798393249512 Epochs: 6, batch: 11 loss: 0.5867450833320618 Epochs: 6, batch: 12 loss: 0.47604942321777344 Epochs: 6, batch: 13 loss: 0.6395967602729797 Epochs: 6, batch: 14 loss: 0.4769871234893799 Epochs: 6, batch: 15 loss: 0.6147444844245911 Epochs: 6, batch: 16 loss: 0.48170027136802673 Epochs: 7, batch: 1 loss: 0.532744288444519 Epochs: 7, batch: 2 loss: 0.5423635840415955 Epochs: 7, batch: 3 loss: 0.5070255398750305 Epochs: 7, batch: 4 loss: 0.5609080791473389 Epochs: 7, batch: 5 loss: 0.49981021881103516 Epochs: 7, batch: 6 loss: 0.5027421712875366 Epochs: 7, batch: 7 loss: 0.47577303647994995 Epochs: 7, batch: 8 loss: 0.61772620677948 Epochs: 7, batch: 9 loss: 0.5960713028907776 Epochs: 7, batch: 10 loss: 0.5478242039680481 Epochs: 7, batch: 11 loss: 0.5167720317840576 Epochs: 7, batch: 12 loss: 0.5153306722640991 Epochs: 7, batch: 13 loss: 0.5272911787033081 Epochs: 7, batch: 14 loss: 0.5109778046607971 Epochs: 7, batch: 15 loss: 0.47579264640808105 Epochs: 7, batch: 16 loss: 0.6350623369216919 Epochs: 8, batch: 1 loss: 0.5981428027153015 Epochs: 8, batch: 2 loss: 0.609207034111023 Epochs: 8, batch: 3 loss: 0.4971064627170563 Epochs: 8, batch: 4 loss: 0.36651310324668884 Epochs: 
8, batch: 5 loss: 0.5871690511703491 Epochs: 8, batch: 6 loss: 0.5053865909576416 Epochs: 8, batch: 7 loss: 0.4875747263431549 Epochs: 8, batch: 8 loss: 0.532880425453186 Epochs: 8, batch: 9 loss: 0.4959374964237213 Epochs: 8, batch: 10 loss: 0.4301111102104187 Epochs: 8, batch: 11 loss: 0.5932532548904419 Epochs: 8, batch: 12 loss: 0.40942129492759705 Epochs: 8, batch: 13 loss: 0.6140817403793335 Epochs: 8, batch: 14 loss: 0.6329962611198425 Epochs: 8, batch: 15 loss: 0.469155877828598 Epochs: 8, batch: 16 loss: 0.5185096859931946 Epochs: 9, batch: 1 loss: 0.5019286870956421 Epochs: 9, batch: 2 loss: 0.4878710210323334 Epochs: 9, batch: 3 loss: 0.6288744211196899 Epochs: 9, batch: 4 loss: 0.46533337235450745 Epochs: 9, batch: 5 loss: 0.4744275212287903 Epochs: 9, batch: 6 loss: 0.5629171133041382 Epochs: 9, batch: 7 loss: 0.4962870478630066 Epochs: 9, batch: 8 loss: 0.49533432722091675 Epochs: 9, batch: 9 loss: 0.6972273588180542 Epochs: 9, batch: 10 loss: 0.5883045792579651 Epochs: 9, batch: 11 loss: 0.4153077006340027 Epochs: 9, batch: 12 loss: 0.4646751284599304 Epochs: 9, batch: 13 loss: 0.5622854828834534 Epochs: 9, batch: 14 loss: 0.4802885949611664 Epochs: 9, batch: 15 loss: 0.47888898849487305 Epochs: 9, batch: 16 loss: 0.3779045641422272
# Evaluate accuracy on the validation loader.
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        inputs, labels = data
        outputs = dense_clf(inputs)
        # BUG FIX: the network emits a single sigmoid probability per sample,
        # so the original `torch.max(outputs.data, 1)` argmax over a size-1
        # dimension always returned index 0 — it predicted "no delay" for
        # everyone, and the reported 85.17% accuracy was just the
        # majority-class rate. Threshold the probability at 0.5 instead.
        predicted = (outputs.view(-1) >= 0.5).double()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy of the network on the {} inputs: {}'.format(
    X_test.shape[0], 100 * correct/total))
Accuracy of the network on the 600 inputs: 85.16666666666667
Given the current problem, it seems that a neural network is not the way to go.
We will use the XGBoost classifier to rank top 300 clients
# Score the held-out task file with the trained XGBoost model and keep the
# 300 clients most likely to delay payment.
task_df = pd.read_csv('test.csv')
task_df = task_df.applymap(yes2int)
input_data = xgb.DMatrix(task_df[predictors])
predictions = xgb_clf.predict(input_data)
output = pd.DataFrame(enumerate(predictions), columns=['index', 'payment_delays'])
results = output.sort_values(by=['payment_delays'], ascending=False).head(300)
results.to_csv('Results.csv', index=False)